Goal Intensities

Try to model goal frequencies as Poisson processes whose rates depend on the teams' ranks.


In [1]:
# %pylab star-imports numpy and matplotlib into the notebook namespace; the bare
# names used in later cells (exp, array, bar, legend, show) come from here.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [12]:
# Get World Cup data
import json

# The file holds a JSON array with one record per match; the path is relative
# to the notebook's working directory.
with open('data/all_matches.json', 'r') as f:
    all_match_data = json.load(f)

In [13]:
# Peek at the first match record to see which fields are available.
all_match_data[0]


Out[13]:
{u'away_team': {u'code': u'CRO', u'country': u'Croatia', u'goals': 1},
 u'away_team_events': [{u'id': 677,
   u'player': u'Brozovi\u0106',
   u'time': u'61',
   u'type_of_event': u'substitution-in'},
  {u'id': 674,
   u'player': u'Corluka',
   u'time': u'66',
   u'type_of_event': u'yellow-card'},
  {u'id': 675,
   u'player': u'Lovren',
   u'time': u'69',
   u'type_of_event': u'yellow-card'},
  {u'id': 676,
   u'player': u'Rebi\u0106',
   u'time': u'78',
   u'type_of_event': u'substitution-in'}],
 u'datetime': u'2014-06-12T17:00:00.000-03:00',
 u'home_team': {u'code': u'BRA', u'country': u'Brazil', u'goals': 3},
 u'home_team_events': [{u'id': 662,
   u'player': u'Marcelo',
   u'time': u'11',
   u'type_of_event': u'goal-own'},
  {u'id': 665,
   u'player': u'Neymar Jr',
   u'time': u'27',
   u'type_of_event': u'yellow-card'},
  {u'id': 666,
   u'player': u'Neymar Jr',
   u'time': u'29',
   u'type_of_event': u'goal'},
  {u'id': 664,
   u'player': u'Paulinho',
   u'time': u'63',
   u'type_of_event': u'substitution-out'},
  {u'id': 672,
   u'player': u'Hernanes',
   u'time': u'63',
   u'type_of_event': u'substitution-in'},
  {u'id': 663,
   u'player': u'Hulk',
   u'time': u'68',
   u'type_of_event': u'substitution-out'},
  {u'id': 673,
   u'player': u'Bernard',
   u'time': u'68',
   u'type_of_event': u'substitution-in'},
  {u'id': 667,
   u'player': u'Neymar Jr',
   u'time': u'71',
   u'type_of_event': u'goal-penalty'},
  {u'id': 670,
   u'player': u'L Gustavo',
   u'time': u'88',
   u'type_of_event': u'yellow-card'},
  {u'id': 668,
   u'player': u'Neymar Jr',
   u'time': u'88',
   u'type_of_event': u'substitution-out'},
  {u'id': 671,
   u'player': u'Ramires',
   u'time': u'88',
   u'type_of_event': u'substitution-in'},
  {u'id': 669,
   u'player': u'Oscar',
   u'time': u'901',
   u'type_of_event': u'goal'}],
 u'location': u'Arena de Sao Paulo',
 u'match_number': 1,
 u'status': u'completed',
 u'winner': u'Brazil',
 u'winner_code': u'BRA'}

In [25]:
# Load into a pandas dataframe
import pandas as pd

# First make a simpler dataset: one record per team per match, tagged with a
# `home` flag (1 = listed as the home team, 0 = listed as the away team).
# dict(mapping, home=...) copies the team record and adds the flag. Unlike
# `a.items() + b.items()` it also works on Python 3, where items() views
# cannot be concatenated with `+`.
home_simple = [dict(game['home_team'], home=1) for game in all_match_data]
away_simple = [dict(game['away_team'], home=0) for game in all_match_data]

# The same records in two orders, so that record i of `simple` and record i of
# `simple_alt` are always the two opponents of one match.
simple = home_simple + away_simple
simple_alt = away_simple + home_simple

In [31]:
# Pair every team's record with its opponent's record by joining on the row index.
# `simple` lists all home records followed by all away records and `simple_alt`
# lists them the other way round, so row i of one frame is the opponent of row i
# of the other. Each match therefore appears twice, once from each side:
# `_1` columns describe the team, `_2` columns describe its opponent.
temp_df = pd.DataFrame(simple)
# fillna(0) zero-fills fields that some team records lack (e.g. `penalties`,
# which the first match record does not carry).
matches_df = pd.DataFrame(simple_alt).join(temp_df, lsuffix='_1', rsuffix='_2').fillna(0)

matches_df.head()


Out[31]:
code_1 country_1 goals_1 home_1 penalties_1 code_2 country_2 goals_2 home_2 penalties_2
0 CRO Croatia 1 0 0 BRA Brazil 3 1 0
1 CMR Cameroon 0 0 0 MEX Mexico 1 1 0
2 NED Netherlands 5 0 0 ESP Spain 1 1 0
3 AUS Australia 1 0 0 CHI Chile 3 1 0
4 GRE Greece 0 0 0 COL Colombia 3 1 0

5 rows × 10 columns


In [32]:
# Rows 64 onwards repeat the same matches from the other side (home team in the `_1` columns).
matches_df[64:69]


Out[32]:
code_1 country_1 goals_1 home_1 penalties_1 code_2 country_2 goals_2 home_2 penalties_2
64 BRA Brazil 3 1 0 CRO Croatia 1 0 0
65 MEX Mexico 1 1 0 CMR Cameroon 0 0 0
66 ESP Spain 1 1 0 NED Netherlands 5 0 0
67 CHI Chile 3 1 0 AUS Australia 1 0 0
68 COL Colombia 3 1 0 GRE Greece 0 0 0

5 rows × 10 columns


In [33]:
# Summary statistics; the `_1` and `_2` columns mirror each other because every
# match is listed once from each side.
matches_df.describe()


Out[33]:
goals_1 home_1 penalties_1 goals_2 home_2 penalties_2
count 128.000000 128.000000 128.000000 128.000000 128.000000 128.000000
mean 1.328125 0.500000 0.203125 1.328125 0.500000 0.203125
std 1.292677 0.501965 0.826332 1.292677 0.501965 0.826332
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 0.500000 0.000000 1.000000 0.500000 0.000000
75% 2.000000 1.000000 0.000000 2.000000 1.000000 0.000000
max 7.000000 1.000000 5.000000 7.000000 1.000000 5.000000

8 rows × 6 columns


In [39]:
# Let's look at a histogram of goals scored per team per match.
# Bin edges sit on half-integers (-0.5, 0.5, 1.5, ...) so that every integer
# goal count gets its own bar, centred on that value, whatever the maximum is.
matches_df['goals_1'].hist(bins=arange(matches_df['goals_1'].max() + 2) - 0.5)
xlabel('Goals scored by a team in one match')
ylabel('Number of team-match rows')
title('Distribution of goals per team per match')
show()



In [52]:
# Clearly teams usually score once.
# If we simply fit a Poisson distribution to this data, what rate would we use?
# We'll answer this question by finding the Poisson rate that maximises the likelihood of the data
from scipy.stats import poisson

def nllf(l, goals=None):
    """Negative log-likelihood of goal counts under a single Poisson rate.

    The rate is parameterised on the log scale (rate = exp(l)), so any real
    value of ``l`` maps to a valid, strictly positive Poisson rate. This lets
    the function be handed to an unconstrained optimiser.

    Parameters
    ----------
    l : float or length-1 array-like
        Natural log of the Poisson rate.
    goals : array-like of int, optional
        Observed goal counts. When omitted, ``matches_df['goals_1']`` is used
        (looked up at call time, so the notebook global must exist by then).

    Returns
    -------
    float
        ``-sum(log pmf(g; exp(l)))`` over all observed counts ``g``.
    """
    if goals is None:
        goals = matches_df['goals_1']
    return -poisson(exp(l)).logpmf(goals).sum()

# Sanity check: evaluate at l = 1, i.e. a rate of e (about 2.72) goals per team per match.
nllf(1)


Out[52]:
252.1465755469828

In [53]:
# Optimize!
from scipy.optimize import minimize

# Minimise the negative log-likelihood over the log-rate, starting from
# l = 0 (a rate of exp(0) = 1 goal per team per match).
result = minimize(nllf, [0])
result


Out[53]:
   status: 0
  success: True
     njev: 9
     nfev: 27
 hess_inv: array([[ 0.00587375]])
      fun: 195.96591207201465
        x: array([ 0.28376822])
  message: 'Optimization terminated successfully.'
      jac: array([  3.81469727e-06])

In [54]:
# Map the fitted log-rate back to a rate. `result.x` is a length-1 array, so
# `simple_rate` is one too; its value matches the sample mean of goals_1 (1.328125).
simple_rate = exp(result.x)
simple_rate


Out[54]:
array([ 1.32812506])

In [68]:
# Observed frequency of each goal count, as {goals scored: number of rows}.
goal_hist = matches_df['goals_1'].value_counts().to_dict()

In [80]:
# See how this overlays: observed goal counts next to the counts expected under
# the fitted Poisson model (pmf scaled by the number of observations).

# Cover every goal count from 0 up to the largest one observed, rather than a
# hard-coded range.
x = arange(int(max(goal_hist)) + 1)
width = 0.33
# align='center' is passed explicitly so bar placement does not depend on the
# matplotlib version's default alignment.
bar(x, [goal_hist.get(i, 0) for i in x], width, align='center',
    alpha=0.5, label='Actual')
bar(x + width, poisson(simple_rate).pmf(x) * len(matches_df['goals_1']),
    width, align='center', color='r', alpha=0.5, label='Simple Poisson')
# Put one integer tick in the middle of each actual/expected pair of bars.
xticks(x + width / 2, x)
xlabel('Goals scored by a team in one match')
ylabel('Number of team-match rows')
title('Observed goal counts vs. fitted Poisson')
legend(loc='best')
show()